import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the credit-card usage dataset; keep `df` as the untouched raw copy
# and apply all preprocessing below to `df1`.
df = pd.read_csv('CC GENERAL.csv')
df1=df.copy()
df1.head(10)
| CUST_ID | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | C10001 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.40 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 |
| 1 | C10002 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.00 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 |
| 2 | C10003 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.00 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 |
| 3 | C10004 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.00 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | NaN | 0.000000 | 12 |
| 4 | C10005 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.00 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 |
| 5 | C10006 | 1809.828751 | 1.000000 | 1333.28 | 0.00 | 1333.28 | 0.000000 | 0.666667 | 0.000000 | 0.583333 | 0.000000 | 0 | 8 | 1800.0 | 1400.057770 | 2407.246035 | 0.000000 | 12 |
| 6 | C10007 | 627.260806 | 1.000000 | 7091.01 | 6402.63 | 688.38 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0 | 64 | 13500.0 | 6354.314328 | 198.065894 | 1.000000 | 12 |
| 7 | C10008 | 1823.652743 | 1.000000 | 436.20 | 0.00 | 436.20 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0 | 12 | 2300.0 | 679.065082 | 532.033990 | 0.000000 | 12 |
| 8 | C10009 | 1014.926473 | 1.000000 | 861.49 | 661.49 | 200.00 | 0.000000 | 0.333333 | 0.083333 | 0.250000 | 0.000000 | 0 | 5 | 7000.0 | 688.278568 | 311.963409 | 0.000000 | 12 |
| 9 | C10010 | 152.225975 | 0.545455 | 1281.60 | 1281.60 | 0.00 | 0.000000 | 0.166667 | 0.166667 | 0.000000 | 0.000000 | 0 | 3 | 11000.0 | 1164.770591 | 100.302262 | 0.000000 | 12 |
df1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8950 entries, 0 to 8949 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CUST_ID 8950 non-null object 1 BALANCE 8950 non-null float64 2 BALANCE_FREQUENCY 8950 non-null float64 3 PURCHASES 8950 non-null float64 4 ONEOFF_PURCHASES 8950 non-null float64 5 INSTALLMENTS_PURCHASES 8950 non-null float64 6 CASH_ADVANCE 8950 non-null float64 7 PURCHASES_FREQUENCY 8950 non-null float64 8 ONEOFF_PURCHASES_FREQUENCY 8950 non-null float64 9 PURCHASES_INSTALLMENTS_FREQUENCY 8950 non-null float64 10 CASH_ADVANCE_FREQUENCY 8950 non-null float64 11 CASH_ADVANCE_TRX 8950 non-null int64 12 PURCHASES_TRX 8950 non-null int64 13 CREDIT_LIMIT 8949 non-null float64 14 PAYMENTS 8950 non-null float64 15 MINIMUM_PAYMENTS 8637 non-null float64 16 PRC_FULL_PAYMENT 8950 non-null float64 17 TENURE 8950 non-null int64 dtypes: float64(14), int64(3), object(1) memory usage: 1.2+ MB
Drop rows with null values and drop CUST_ID column
# Drop the 314 rows containing nulls (CREDIT_LIMIT / MINIMUM_PAYMENTS) and
# remove the non-numeric customer identifier before clustering.
df1.dropna(inplace=True)
df1.drop('CUST_ID', axis=1, inplace=True)
#Check number of null values
df1.isna().sum()
BALANCE 0 BALANCE_FREQUENCY 0 PURCHASES 0 ONEOFF_PURCHASES 0 INSTALLMENTS_PURCHASES 0 CASH_ADVANCE 0 PURCHASES_FREQUENCY 0 ONEOFF_PURCHASES_FREQUENCY 0 PURCHASES_INSTALLMENTS_FREQUENCY 0 CASH_ADVANCE_FREQUENCY 0 CASH_ADVANCE_TRX 0 PURCHASES_TRX 0 CREDIT_LIMIT 0 PAYMENTS 0 MINIMUM_PAYMENTS 0 PRC_FULL_PAYMENT 0 TENURE 0 dtype: int64
Draw a box plot to highlight the distribution of outliers in each column.
# Interactive box plots of every column to highlight outliers.
import plotly.express as px
px.box(df1)
Now we'll see how each column in the dataframe is distributed.
# Draw a KDE of every numeric column on a 9x2 grid to inspect skewness.
plt.figure(figsize=(20, 35))
for position, column in enumerate(df1.columns):
    if df1[column].dtype != 'object':
        axis = plt.subplot(9, 2, position + 1)
        sns.kdeplot(df1[column], ax=axis)
        plt.xlabel(column)
df1.describe()
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 |
| mean | 1601.224893 | 0.895035 | 1025.433874 | 604.901438 | 420.843533 | 994.175523 | 0.496000 | 0.205909 | 0.368820 | 0.137604 | 3.313918 | 15.033233 | 4522.091030 | 1784.478099 | 864.304943 | 0.159304 | 11.534391 |
| std | 2095.571300 | 0.207697 | 2167.107984 | 1684.307803 | 917.245182 | 2121.458303 | 0.401273 | 0.300054 | 0.398093 | 0.201791 | 6.912506 | 25.180468 | 3659.240379 | 2909.810090 | 2372.566350 | 0.296271 | 1.310984 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 50.000000 | 0.049513 | 0.019163 | 0.000000 | 6.000000 |
| 25% | 148.095189 | 0.909091 | 43.367500 | 0.000000 | 0.000000 | 0.000000 | 0.083333 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1600.000000 | 418.559237 | 169.163545 | 0.000000 | 12.000000 |
| 50% | 916.855459 | 1.000000 | 375.405000 | 44.995000 | 94.785000 | 0.000000 | 0.500000 | 0.083333 | 0.166667 | 0.000000 | 0.000000 | 7.000000 | 3000.000000 | 896.675701 | 312.452292 | 0.000000 | 12.000000 |
| 75% | 2105.195853 | 1.000000 | 1145.980000 | 599.100000 | 484.147500 | 1132.385490 | 0.916667 | 0.333333 | 0.750000 | 0.250000 | 4.000000 | 18.000000 | 6500.000000 | 1951.142090 | 825.496463 | 0.166667 | 12.000000 |
| max | 19043.138560 | 1.000000 | 49039.570000 | 40761.250000 | 22500.000000 | 47137.211760 | 1.000000 | 1.000000 | 1.000000 | 1.500000 | 123.000000 | 358.000000 | 30000.000000 | 50721.483360 | 76406.207520 | 1.000000 | 12.000000 |
To deal with the skewness of data in columns, we use Log Transformation.
# The distributions above are heavily right-skewed, so apply a log
# transformation. np.log1p(x) computes log(1 + x) with better numerical
# accuracy near zero than np.log(1 + x), and matches the np.log1p used in
# the later preprocessing sections of this notebook.
for col in df1.columns:
    df1[col] = np.log1p(df1[col])
Display the difference in distribution after log transformation
# Re-plot the per-column KDEs to verify the skew is reduced after the log.
plt.figure(figsize=(20,35))
for i, col in enumerate(df1.columns):
if df1[col].dtype!='object':
ax = plt.subplot(9, 2, i+1)
sns.kdeplot(df1[col], ax=ax)
plt.xlabel(col)
Then we look at the dataset's features to see the correlation between them.
# Correlation heatmap between all features, annotated with coefficients.
plt.figure(figsize=(12,12))
sns.heatmap(df1.corr(), annot=True)
plt.show()
To get the most significant components, we use PCA as a dimensionality reduction method.
from sklearn.decomposition import PCA
# Keep enough principal components to explain 95% of the variance.
pca = PCA(n_components=0.95)
Res_PCA = pca.fit_transform(df1)
# Name the columns after however many components PCA actually kept,
# instead of hard-coding exactly five names — the 0.95 threshold can
# select a different number of components, which would raise a
# shape/columns mismatch in the DataFrame constructor.
df_PCA = pd.DataFrame(Res_PCA,
                      columns=['Comp' + str(i + 1)
                               for i in range(Res_PCA.shape[1])])
df_PCA.head(10)
| Comp1 | Comp2 | Comp3 | Comp4 | Comp5 | |
|---|---|---|---|---|---|
| 0 | 0.995208 | -5.729093 | 0.598730 | 0.526043 | 0.674032 |
| 1 | -9.015723 | 1.293468 | 0.858426 | -0.879599 | -1.484887 |
| 2 | 2.589857 | 1.240795 | -5.040086 | -2.434489 | 0.921386 |
| 3 | -1.262858 | -2.881455 | -3.916192 | -2.154545 | -0.108838 |
| 4 | 3.358507 | -2.725612 | 3.879279 | -3.236283 | 1.078897 |
| 5 | 8.185076 | 2.760389 | -0.932128 | -0.269855 | -1.975570 |
| 6 | 2.437408 | -3.302516 | 2.812188 | -2.667473 | 0.874665 |
| 7 | 4.879882 | 0.481023 | -1.272693 | -1.222344 | -0.500349 |
| 8 | 3.149352 | 0.289802 | -5.905517 | 0.387657 | -0.351562 |
| 9 | 3.127430 | -2.996327 | 3.562062 | -2.942204 | 1.257468 |
Now, we have two approaches to cluster the data:
1- Perform embedding using TSNE Algorithm, then cluster the result from TSNE using any of the clustering algorithms.
2- Perform clustering for the data in high dimension which is not suitable for some algorithms then embedding the data using TSNE and visualize the clustering result using labels we have got before applying TSNE.
Now, we will try each of the two approaches and see which one gives better clusters.
Use the TSNE algorithm for embedding (moving from a high-dimensional space to a low-dimensional one).
from sklearn.manifold import TSNE
# Embed the PCA components into 2-D for visualisation.
# perplexity=50 favours global structure; random_state fixes the layout.
tsne_projection = TSNE(n_components=2,
perplexity=50,
n_iter=10**4,
early_exaggeration = 12,
init='random',
random_state=42).fit_transform(df_PCA)
tsne_projection = pd.DataFrame(tsne_projection)
tsne_projection
| 0 | 1 | |
|---|---|---|
| 0 | -5.055622 | 71.688934 |
| 1 | -17.306173 | -64.952103 |
| 2 | -86.968208 | 46.928871 |
| 3 | -64.506699 | 39.624664 |
| 4 | 26.037098 | 112.655975 |
| ... | ... | ... |
| 8631 | -78.645218 | -0.355160 |
| 8632 | -13.115753 | 85.055603 |
| 8633 | -11.645321 | 68.451218 |
| 8634 | -53.983891 | -29.189922 |
| 8635 | -117.185425 | -32.201927 |
8636 rows × 2 columns
from sklearn.cluster import KMeans
# Elbow method: fit KMeans for k = 1..9 and plot the within-cluster sum
# of squares (inertia) against k to pick a reasonable cluster count.
kmeans_models = [KMeans(n_clusters=k, random_state=42).fit(tsne_projection)
                 for k in range(1, 10)]
inertia_values = [model.inertia_ for model in kmeans_models]
plt.plot(range(1, 10), inertia_values)
plt.title('Elbow method')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()
From the Elbow method, we choose 5 clusters.
# Fit the final KMeans with the k suggested by the elbow (k = 5).
choosed_model = KMeans(n_clusters=5, random_state=42).fit(tsne_projection)
choosed_model.labels_
array([0, 2, 3, ..., 0, 2, 3])
# Turn the integer labels into readable legend names and plot the
# t-SNE embedding coloured by KMeans cluster.
cluster_labels = ["cluster" + str(label)
for label in choosed_model.labels_]
import plotly.express as px
fig = px.scatter(x=tsne_projection[0],
y=tsne_projection[1],
text=tsne_projection.index,
color=cluster_labels)
fig.update_traces(textposition='top center')
fig.update_layout(height=800, width=800, title_text='Cluster')
fig.show()
We will try different number of clusters to get the best one using silhouette_score as the metric that measures the accuracy of clustering
from sklearn.metrics import silhouette_score
import numpy as np
# Try k = 2..9 and keep the KMeans model with the highest silhouette
# score on the t-SNE embedding. (Module-level names range_n_cluster,
# silhoutte_score and best_cluster_model are reused by later cells.)
range_n_cluster = list(range(2, 10))
silhoutte_score = []
best_cluster_model = None
for k in range_n_cluster:
    candidate = KMeans(n_clusters=k, random_state=42)
    score = silhouette_score(tsne_projection,
                             candidate.fit(tsne_projection).labels_)
    silhoutte_score.append(score)
    # `score` is already in the list, so >= max keeps the latest argmax.
    if score >= np.max(silhoutte_score):
        best_cluster_model = candidate
plt.plot(range_n_cluster, silhoutte_score)
plt.axvline(best_cluster_model.n_clusters, color='black')
<matplotlib.lines.Line2D at 0x19ea44bd8e0>
# Visualise the embedding coloured by the best-silhouette KMeans labels.
cluster_labels = ["cluster" + str(label)
for label in best_cluster_model.labels_]
import plotly.express as px
fig = px.scatter(x=tsne_projection[0],
y=tsne_projection[1],
text=tsne_projection.index,
color=cluster_labels)
fig.update_traces(textposition='top center')
fig.update_layout(height=800, width=800, title_text='Cluster')
fig.show()
Choose number of clusters that give the best silhouette_score
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import numpy as np
# Agglomerative (Ward) clustering on the t-SNE embedding: try k = 2..14
# and keep the model with the highest silhouette score.
range_n_cluster = list(range(2,15))
silhoutte_score = []
best_cluster_model = None
for n_clusters in range_n_cluster:
cluster_model = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
cluster_labels = cluster_model.fit_predict(tsne_projection)
silhouette_avg = silhouette_score(tsne_projection, cluster_labels)
silhoutte_score += [silhouette_avg]
if silhouette_avg >= np.max(silhoutte_score):
best_cluster_model = cluster_model
plt.plot(range_n_cluster, silhoutte_score)
plt.axvline(best_cluster_model.n_clusters, color='black')
<matplotlib.lines.Line2D at 0x19ea1de6a00>
# Plot the embedding coloured by the best Agglomerative model's labels.
cluster_labels = ["cluster" + str(label)
for label in best_cluster_model.labels_]
import plotly.express as px
fig = px.scatter(x=tsne_projection[0],
y=tsne_projection[1],
text=tsne_projection.index,
color=cluster_labels)
fig.update_traces(textposition='top center')
fig.update_layout(height=800, width=800, title_text='Cluster')
fig.show()
Here we will detect anomalies through this algorithm for result of TSNE
from sklearn.cluster import DBSCAN
# DBSCAN on the t-SNE embedding; label -1 marks noise/anomaly points.
db_default = DBSCAN(eps = 3.5, min_samples = 5).fit(tsne_projection)
labels = db_default.labels_
cluster_labels = ["cluster (" + str(label) + ")"
for label in labels]
import plotly.express as px
fig = px.scatter(x=tsne_projection[0],
y=tsne_projection[1],
text=tsne_projection.index,
color=cluster_labels)
fig.update_traces(textposition='top center')
fig.update_layout(height=800, width=800, title_text='Cluster')
fig.show()
Only one point is detected as anomaly
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
import numpy as np
# Gaussian-mixture clustering on the t-SNE embedding: try 2..14
# components and keep the model with the best silhouette score.
range_n_cluster = list(range(2,15))
silhoutte_score = []
best_cluster_model = None
for n_clusters in range_n_cluster:
    # random_state=42 added for reproducibility, consistent with the
    # KMeans cells; without it the selected "best" model changes per run.
    cluster_model = GaussianMixture(n_components = n_clusters, random_state=42)
    cluster_labels = cluster_model.fit_predict(tsne_projection)
    silhouette_avg = silhouette_score(tsne_projection, cluster_labels)
    silhoutte_score += [silhouette_avg]
    if silhouette_avg >= np.max(silhoutte_score):
        best_cluster_model = cluster_model
        labels = cluster_labels
plt.plot(range_n_cluster, silhoutte_score)
plt.axvline(best_cluster_model.n_components, color='black')
<matplotlib.lines.Line2D at 0x19ea4531ee0>
# Plot the embedding coloured by the best Gaussian-mixture labels.
cluster_labels = ["cluster (" + str(label) + ")"
for label in labels]
import plotly.express as px
fig = px.scatter(x=tsne_projection[0],
y=tsne_projection[1],
text=tsne_projection.index,
color=cluster_labels)
fig.update_traces(textposition='top center')
fig.update_layout(height=800, width=800, title_text='Cluster')
fig.show()
from sklearn.ensemble import IsolationForest
# Isolation Forest anomaly detection on the t-SNE embedding: label -1
# marks the ~10% most anomalous points (contamination=0.1).
# random_state=42 added for reproducibility, matching the other models.
model = IsolationForest(n_estimators=100, contamination=0.1, max_features=0.7,
                        random_state=42)
labels = model.fit_predict(tsne_projection)
cluster_labels = ["cluster (" + str(label) + ")"
                  for label in labels]
import plotly.express as px
fig = px.scatter(x=tsne_projection[0],
                 y=tsne_projection[1],
                 text=tsne_projection.index,
                 color=cluster_labels)
fig.update_traces(textposition='top center')
fig.update_layout(height=800, width=800, title_text='Cluster')
fig.show()
from sklearn.metrics import silhouette_score
import numpy as np
# Second approach: cluster directly in the high-dimensional PCA space
# (k = 2..9), keeping the KMeans model with the best silhouette score.
range_n_cluster = list(range(2,10))
silhoutte_score = []
best_cluster_model = None
for n_clusters in range_n_cluster:
cluster_model = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = cluster_model.fit(df_PCA).labels_
silhouette_avg = silhouette_score(df_PCA, cluster_labels)
silhoutte_score += [silhouette_avg]
if silhouette_avg >= np.max(silhoutte_score):
best_cluster_model = cluster_model
plt.plot(range_n_cluster, silhoutte_score)
plt.axvline(best_cluster_model.n_clusters, color='black')
<matplotlib.lines.Line2D at 0x19ea41829d0>
# Colour the t-SNE embedding with labels computed in the PCA space:
# clustering happened in high dimension, t-SNE is only for display.
cluster_labels = ["cluster" + str(label)
for label in best_cluster_model.labels_]
import plotly.express as px
fig = px.scatter(x=tsne_projection[0],
y=tsne_projection[1],
text=tsne_projection.index,
color=cluster_labels)
fig.update_traces(textposition='top center')
fig.update_layout(height=800, width=800, title_text='Cluster')
fig.show()
Here we can see that KMeans has better performance in clustering with first approach.
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import numpy as np
# Agglomerative (Ward) clustering in the PCA space, k = 2..14.
range_n_cluster = list(range(2,15))
silhoutte_score = []
best_cluster_model = None
for n_clusters in range_n_cluster:
cluster_model = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
cluster_labels = cluster_model.fit_predict(df_PCA)
silhouette_avg = silhouette_score(df_PCA, cluster_labels)
silhoutte_score += [silhouette_avg]
if silhouette_avg >= np.max(silhoutte_score):
best_cluster_model = cluster_model
plt.plot(range_n_cluster, silhoutte_score)
plt.axvline(best_cluster_model.n_clusters, color='black')
<matplotlib.lines.Line2D at 0x19ea44cbcd0>
# Display the PCA-space Agglomerative labels on the t-SNE embedding.
cluster_labels = ["cluster" + str(label)
for label in best_cluster_model.labels_]
import plotly.express as px
fig = px.scatter(x=tsne_projection[0],
y=tsne_projection[1],
text=tsne_projection.index,
color=cluster_labels)
fig.update_traces(textposition='top center')
fig.update_layout(height=800, width=800, title_text='Cluster')
fig.show()
There is no big difference between two approaches here with this algorithm
from sklearn.cluster import DBSCAN
# DBSCAN in the PCA space (smaller eps than on t-SNE because PCA
# coordinates are on a tighter scale); label -1 marks anomalies.
db_default = DBSCAN(eps = 1.2, min_samples = 5).fit(df_PCA)
labels = db_default.labels_
cluster_labels = ["cluster (" + str(label) + ")"
for label in labels]
import plotly.express as px
fig = px.scatter(x=tsne_projection[0],
y=tsne_projection[1],
text=tsne_projection.index,
color=cluster_labels)
fig.update_traces(textposition='top center')
fig.update_layout(height=800, width=800, title_text='Cluster')
fig.show()
In this approach, the number of anomalies is increased remarkably.
from sklearn import preprocessing
# Standardise the PCA components before fitting the Gaussian mixture.
scaler = preprocessing.StandardScaler()
scaled_data = scaler.fit_transform(df_PCA)
xs = pd.DataFrame(scaled_data, columns = df_PCA.columns)
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
import numpy as np
# Gaussian-mixture clustering on the standardised PCA components,
# keeping the component count with the best silhouette score.
range_n_cluster = list(range(2,15))
silhoutte_score = []
best_cluster_model = None
for n_clusters in range_n_cluster:
    # random_state=42 added for reproducibility, consistent with the
    # KMeans cells; without it the selected "best" model changes per run.
    cluster_model = GaussianMixture(n_components = n_clusters, random_state=42)
    cluster_labels = cluster_model.fit_predict(xs)
    silhouette_avg = silhouette_score(xs, cluster_labels)
    silhoutte_score += [silhouette_avg]
    if silhouette_avg >= np.max(silhoutte_score):
        best_cluster_model = cluster_model
        labels = cluster_labels
plt.plot(range_n_cluster, silhoutte_score)
plt.axvline(best_cluster_model.n_components, color='black')
<matplotlib.lines.Line2D at 0x19ea3f4aee0>
# Display the PCA-space Gaussian-mixture labels on the t-SNE embedding.
cluster_labels = ["cluster (" + str(label) + ")"
for label in labels]
import plotly.express as px
fig = px.scatter(x=tsne_projection[0],
y=tsne_projection[1],
text=tsne_projection.index,
color=cluster_labels)
fig.update_traces(textposition='top center')
fig.update_layout(height=800, width=800, title_text='Cluster')
fig.show()
From the clustering above, I can see that the first approach is better than this one.
from sklearn.ensemble import IsolationForest
# Isolation Forest anomaly detection in the PCA space (label -1 = anomaly).
# random_state=42 added for reproducibility, matching the other models.
model = IsolationForest(n_estimators=100, contamination=0.1, max_features=0.7,
                        random_state=42)
labels = model.fit_predict(df_PCA)
cluster_labels = ["cluster (" + str(label) + ")"
                  for label in labels]
import plotly.express as px
fig = px.scatter(x=tsne_projection[0],
                 y=tsne_projection[1],
                 text=tsne_projection.index,
                 color=cluster_labels)
fig.update_traces(textposition='top center')
fig.update_layout(height=800, width=800, title_text='Cluster')
fig.show()
I can't decide here which one is better for anomaly detection, but maybe it depends on the application in which we use this algorithm.
I found this link while I was searching in that area https://stats.stackexchange.com/questions/263539/clustering-on-the-output-of-t-sne
This Will be in a separate notebook with name "Credit Card Clustering + Kernel PCA.ipynb"
We will use only one algorithm "KMeans" for clustering in the following part.
1- Remove nulls
2- Drop CUST_ID column
# Second pipeline: restart from the raw data, drop nulls and CUST_ID.
df2 = df.copy()
df2.dropna(inplace=True)
df2.drop('CUST_ID', axis=1, inplace=True)
3- Robust Scaler, which is suitable for datasets with skewed distributions and outliers because it transforms the data based on the median and quantiles.
from sklearn.preprocessing import RobustScaler
# Robust scaling (median/IQR based) is less sensitive to the outliers
# seen in the box plots above.
model = RobustScaler()
df2 = pd.DataFrame(model.fit_transform(df2), columns = df2.columns)
# Inspect the per-column distributions after robust scaling.
plt.figure(figsize=(20, 35))
for position, column in enumerate(df2.columns):
    if df2[column].dtype != 'object':
        axis = plt.subplot(9, 2, position + 1)
        sns.kdeplot(df2[column], ax=axis)
        plt.xlabel(column)
from sklearn.manifold import TSNE
# 2-D embedding of the robust-scaled data (same t-SNE settings as before).
tsne_projection2 = TSNE(n_components=2,
perplexity=50,
n_iter=10**4,
early_exaggeration = 12,
init='random',
random_state=42).fit_transform(df2)
tsne_projection2 = pd.DataFrame(tsne_projection2)
tsne_projection2
| 0 | 1 | |
|---|---|---|
| 0 | 47.569439 | 3.619377 |
| 1 | 12.108082 | -51.948540 |
| 2 | -34.217415 | 35.600784 |
| 3 | -59.729904 | -9.096209 |
| 4 | -66.119415 | 34.207920 |
| ... | ... | ... |
| 8631 | 82.136665 | -45.170609 |
| 8632 | 61.986465 | 49.638016 |
| 8633 | 70.225571 | -44.754032 |
| 8634 | 70.163155 | -46.208984 |
| 8635 | 76.147491 | -43.901592 |
8636 rows × 2 columns
from sklearn.metrics import silhouette_score
import numpy as np
# KMeans (k = 2..9) on the robust-scaled features; keep the model with
# the best silhouette score.
range_n_cluster = list(range(2,10))
silhoutte_score = []
best_cluster_model = None
for n_clusters in range_n_cluster:
cluster_model = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = cluster_model.fit(df2).labels_
silhouette_avg = silhouette_score(df2, cluster_labels)
silhoutte_score += [silhouette_avg]
if silhouette_avg >= np.max(silhoutte_score):
best_cluster_model = cluster_model
plt.plot(range_n_cluster, silhoutte_score)
plt.axvline(best_cluster_model.n_clusters, color='black')
<matplotlib.lines.Line2D at 0x19ea425f940>
cluster_labels = ["cluster" + str(label)
                  for label in best_cluster_model.labels_]
import plotly.express as px
# BUG FIX: this section clusters the Robust-scaled data (df2), so the
# points must be drawn on its own embedding `tsne_projection2` — the
# original plotted them on `tsne_projection` from the first approach,
# showing the labels on the wrong coordinates.
fig = px.scatter(x=tsne_projection2[0],
                 y=tsne_projection2[1],
                 text=tsne_projection2.index,
                 color=cluster_labels)
fig.update_traces(textposition='top center')
fig.update_layout(height=800, width=800, title_text='Cluster')
fig.show()
With only Robust scaling the data is not well clustered, so let's try Robust + PCA.
Robust Scaling + PCA
from sklearn.decomposition import PCA
# PCA on the robust-scaled data, keeping 95% of the variance.
pca2 = PCA(n_components=0.95)
Res_PCA2 = pca2.fit_transform(df2)
df2_PCA = pd.DataFrame(Res_PCA2)
df2_PCA.head(10)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | -2.596229 | -0.171497 | -1.071034 | 0.059702 | 0.860294 | 0.891342 | 0.527637 | -0.040093 | 0.517276 |
| 1 | -0.325228 | 1.774175 | 3.789277 | 1.530645 | -0.758796 | -1.037554 | 1.598221 | -1.715482 | -0.951382 |
| 2 | -0.233322 | -0.315416 | -0.519626 | -1.795153 | 1.523431 | 0.084125 | 0.540203 | 0.183529 | -1.315513 |
| 3 | -2.148961 | 0.150073 | -0.421820 | -1.629857 | 0.983295 | 0.261734 | 0.742518 | -0.468836 | 0.578381 |
| 4 | 1.021615 | 2.109443 | -1.616171 | -1.537228 | -0.991736 | 1.347693 | 0.362237 | -0.607528 | 1.165754 |
| 5 | 9.490358 | -7.986251 | -1.161180 | 0.342110 | 1.414507 | -4.638349 | 0.132984 | 0.817816 | -0.843175 |
| 6 | -1.251773 | 0.222144 | -0.716265 | -1.871176 | -0.007358 | 0.803081 | 0.514988 | -0.318970 | 0.147764 |
| 7 | -0.889073 | -0.477214 | -0.475818 | -1.601296 | 1.113433 | 0.402490 | 0.597305 | -0.321474 | 0.415976 |
| 8 | -1.067515 | -1.656608 | -1.478763 | 3.093893 | 2.005896 | 1.390732 | 0.608749 | 0.405519 | -0.070094 |
| 9 | 0.446429 | 1.956605 | -1.573777 | -1.629534 | -0.586921 | 0.993940 | 0.366444 | -0.412694 | 0.664021 |
from sklearn.manifold import TSNE
# Re-embed after Robust scaling + PCA (overwrites tsne_projection2).
tsne_projection2 = TSNE(n_components=2,
perplexity=50,
n_iter=10**4,
early_exaggeration = 12,
init='random',
random_state=42).fit_transform(df2_PCA)
tsne_projection2 = pd.DataFrame(tsne_projection2)
tsne_projection2
| 0 | 1 | |
|---|---|---|
| 0 | 47.569439 | 3.619377 |
| 1 | 12.108082 | -51.948540 |
| 2 | -34.217415 | 35.600784 |
| 3 | -59.729904 | -9.096209 |
| 4 | -66.119415 | 34.207920 |
| ... | ... | ... |
| 8631 | 82.136665 | -45.170609 |
| 8632 | 61.986465 | 49.638016 |
| 8633 | 70.225571 | -44.754032 |
| 8634 | 70.163155 | -46.208984 |
| 8635 | 76.147491 | -43.901592 |
8636 rows × 2 columns
from sklearn.metrics import silhouette_score
import numpy as np
# KMeans (k = 2..9) on the Robust + PCA features; keep the best model.
range_n_cluster = list(range(2,10))
silhoutte_score = []
best_cluster_model = None
for n_clusters in range_n_cluster:
cluster_model = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = cluster_model.fit(df2_PCA).labels_
silhouette_avg = silhouette_score(df2_PCA, cluster_labels)
silhoutte_score += [silhouette_avg]
if silhouette_avg >= np.max(silhoutte_score):
best_cluster_model = cluster_model
plt.plot(range_n_cluster, silhoutte_score)
plt.axvline(best_cluster_model.n_clusters, color='black')
<matplotlib.lines.Line2D at 0x19ea2477d30>
cluster_labels = ["cluster" + str(label)
                  for label in best_cluster_model.labels_]
import plotly.express as px
# BUG FIX: the hover text used `tsne_projection.index` (the first
# approach's embedding) while x/y came from `tsne_projection2`; use the
# matching embedding's index throughout.
fig = px.scatter(x=tsne_projection2[0],
                 y=tsne_projection2[1],
                 text=tsne_projection2.index,
                 color=cluster_labels)
fig.update_traces(textposition='top center')
fig.update_layout(height=800, width=800, title_text='Cluster')
fig.show()
This preprocessing failed to cluster the data well.
Robust Scaler + Log Transformation + PCA
# Third pipeline: log1p transform first, then robust scaling.
df2 = df.copy()
df2.dropna(inplace=True)
df2.drop('CUST_ID', axis=1, inplace=True)
for col in df2.columns:
df2[col] = np.log1p(df2[col])
from sklearn.preprocessing import RobustScaler
model = RobustScaler()
df_scaled = pd.DataFrame(model.fit_transform(df2), columns = df2.columns)
from sklearn.decomposition import PCA
# PCA (95% variance) on the log-transformed, robust-scaled features.
pca2 = PCA(n_components=0.95)
Res_PCA2 = pca2.fit_transform(df_scaled)
df2_PCA = pd.DataFrame(Res_PCA2)
df2_PCA.head(10)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | |
|---|---|---|---|---|---|---|---|
| 0 | 1.110265 | -0.519702 | -0.572972 | -1.880267 | -0.000515 | 0.145439 | 0.034131 |
| 1 | -0.459101 | -1.524378 | -1.145827 | 1.774767 | -0.036560 | -0.979541 | 0.246700 |
| 2 | -1.554687 | 0.540317 | 1.435146 | -0.514068 | -1.632920 | -0.306733 | -0.352524 |
| 3 | -1.131166 | -0.837587 | -0.679155 | -1.214444 | -0.765469 | -0.580396 | 0.063861 |
| 4 | -1.533254 | 0.100761 | 0.649912 | -0.844083 | 1.171807 | -0.997549 | -0.521244 |
| 5 | -1.055355 | 4.527684 | -0.247321 | 1.969914 | -0.805688 | -0.307641 | 0.231320 |
| 6 | -1.387107 | 0.232935 | 0.344453 | -1.264122 | 1.068622 | -0.471225 | 0.042351 |
| 7 | -1.305344 | 0.188775 | 0.537555 | -0.971108 | -0.158880 | -0.530621 | 0.417173 |
| 8 | 4.213962 | -0.333132 | 1.509802 | -0.280814 | -0.807276 | -0.478799 | 0.695110 |
| 9 | -1.509064 | 0.246259 | 0.593578 | -1.086442 | 1.392355 | -0.730671 | -0.611156 |
from sklearn.manifold import TSNE
# 2-D embedding of the Log + Robust + PCA features.
tsne_projection2 = TSNE(n_components=2,
perplexity=50,
n_iter=10**4,
early_exaggeration = 12,
init='random',
random_state=42).fit_transform(df2_PCA)
tsne_projection2 = pd.DataFrame(tsne_projection2)
tsne_projection2
| 0 | 1 | |
|---|---|---|
| 0 | 37.872433 | 21.295618 |
| 1 | 31.648838 | -71.890198 |
| 2 | -47.247223 | 33.345055 |
| 3 | -22.154964 | -0.229090 |
| 4 | -9.548319 | 36.634544 |
| ... | ... | ... |
| 8631 | 64.153000 | -11.419135 |
| 8632 | 34.676113 | 69.924522 |
| 8633 | 46.546764 | 35.470074 |
| 8634 | 36.856972 | -50.178875 |
| 8635 | 35.841660 | -16.385178 |
8636 rows × 2 columns
# BUG FIX: the original cell plotted `silhoutte_score` and
# `best_cluster_model` left over from the previous (Robust-only)
# section. Recompute the KMeans silhouette sweep on the new
# Log + Robust + PCA features before plotting.
from sklearn.metrics import silhouette_score
range_n_cluster = list(range(2, 10))
silhoutte_score = []
best_cluster_model = None
for n_clusters in range_n_cluster:
    cluster_model = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = cluster_model.fit(df2_PCA).labels_
    silhouette_avg = silhouette_score(df2_PCA, cluster_labels)
    silhoutte_score += [silhouette_avg]
    if silhouette_avg >= np.max(silhoutte_score):
        best_cluster_model = cluster_model
plt.plot(range_n_cluster, silhoutte_score)
plt.axvline(best_cluster_model.n_clusters, color='black')
<matplotlib.lines.Line2D at 0x19ea2799700>
# Visualise the best KMeans labels on the Log + Robust + PCA embedding.
cluster_labels = ["cluster" + str(label)
for label in best_cluster_model.labels_]
import plotly.express as px
fig = px.scatter(x=tsne_projection2[0],
y=tsne_projection2[1],
text=tsne_projection2.index,
color=cluster_labels)
fig.update_traces(textposition='top center')
fig.update_layout(height=800, width=800, title_text='Cluster')
fig.show()
Standard scaler + PCA
# Fourth pipeline: standard scaling of the raw (untransformed) features.
df3 = df.copy()
df3.dropna(inplace=True)
df3.drop('CUST_ID', axis=1, inplace=True)
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
scaled_data = scaler.fit_transform(df3)
df_scaled = pd.DataFrame(scaled_data, columns = df3.columns)
from sklearn.decomposition import PCA
# PCA (95% variance) on the standard-scaled features.
pca3 = PCA(n_components=0.95)
# BUG FIX: the original called pca2.fit_transform here, silently reusing
# the PCA object from the previous section instead of the new `pca3`.
Res_PCA3 = pca3.fit_transform(df_scaled)
df3_PCA = pd.DataFrame(Res_PCA3)
df3_PCA.head(10)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.696395 | -1.122584 | 0.491562 | 0.719521 | 0.079830 | 0.118234 | 0.808993 | -0.093970 | -0.016190 | -0.082402 | -0.216529 | -0.053144 |
| 1 | -1.215681 | 2.435638 | 0.694658 | -0.098843 | 0.803019 | -0.917777 | -0.322969 | -0.045119 | 0.754617 | -0.748468 | -0.878351 | -0.592741 |
| 2 | 0.935853 | -0.385202 | -0.025953 | 1.293844 | -1.987285 | -0.682139 | -1.624721 | 0.073401 | -0.837066 | -0.034854 | -0.746798 | 0.695779 |
| 3 | -1.614638 | -0.724586 | 0.272358 | 1.086116 | -0.427814 | 0.082982 | 0.687001 | 0.063548 | 0.566940 | -0.083532 | -0.466749 | -0.119286 |
| 4 | 0.223701 | -0.783610 | -1.184434 | 0.721353 | 0.801243 | 0.525879 | 0.788893 | -0.089942 | 0.365857 | -0.192647 | -0.194651 | 0.332570 |
| 5 | 6.265598 | -0.609449 | 2.085564 | -0.577785 | -0.965617 | -1.091335 | -1.369075 | 1.058575 | 0.759205 | -0.030365 | 1.150347 | 0.021525 |
| 6 | 0.261667 | -1.295635 | -1.825458 | 0.066776 | 0.219371 | -0.101161 | 0.416597 | -0.523560 | 0.024977 | -0.947101 | 0.558480 | -0.064619 |
| 7 | -0.465339 | -0.477698 | 0.097163 | 1.061918 | -0.158330 | -0.296466 | -0.079364 | -0.598619 | 0.597367 | -0.054759 | 0.188932 | 0.416755 |
| 8 | -0.599681 | -0.408591 | 2.017645 | 1.043335 | 0.573139 | -0.559965 | -0.768473 | -1.018236 | -0.687633 | 0.214814 | 0.512263 | 0.794538 |
| 9 | 0.522771 | -1.312163 | -1.942541 | 0.168927 | 0.650792 | 0.330538 | 0.715911 | -0.058458 | -0.020553 | -0.834802 | 0.279073 | 0.250272 |
TSNE projection of df3_PCA
from sklearn.manifold import TSNE
# 2-D embedding of the Standard-scaler + PCA features.
tsne_projection3 = TSNE(n_components=2,
perplexity=50,
n_iter=10**4,
early_exaggeration = 12,
init='random',
random_state=42).fit_transform(df3_PCA)
tsne_projection3 = pd.DataFrame(tsne_projection3)
tsne_projection3
| 0 | 1 | |
|---|---|---|
| 0 | -61.287296 | 42.328407 |
| 1 | 12.276224 | -85.403908 |
| 2 | 49.847637 | -3.592204 |
| 3 | -14.563591 | -9.733766 |
| 4 | 3.366188 | 24.789026 |
| ... | ... | ... |
| 8631 | -66.320473 | -2.610976 |
| 8632 | -46.017849 | 18.294941 |
| 8633 | -47.564903 | 14.904220 |
| 8634 | -62.343464 | -9.596377 |
| 8635 | -58.407841 | -3.884658 |
8636 rows × 2 columns
from sklearn.metrics import silhouette_score
import numpy as np
# KMeans (k = 2..9) on the Standard-scaler + PCA features.
range_n_cluster = list(range(2,10))
silhoutte_score = []
best_cluster_model = None
for n_clusters in range_n_cluster:
cluster_model = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = cluster_model.fit(df3_PCA).labels_
silhouette_avg = silhouette_score(df3_PCA, cluster_labels)
silhoutte_score += [silhouette_avg]
if silhouette_avg >= np.max(silhoutte_score):
best_cluster_model = cluster_model
plt.plot(range_n_cluster, silhoutte_score)
plt.axvline(best_cluster_model.n_clusters, color='black')
<matplotlib.lines.Line2D at 0x19ea246fee0>
cluster_labels = ["cluster" + str(label)
                  for label in best_cluster_model.labels_]
import plotly.express as px
# BUG FIX: the hover text used `tsne_projection.index` from the first
# approach; this plot is drawn on `tsne_projection3`, so use its index.
fig = px.scatter(x=tsne_projection3[0],
                 y=tsne_projection3[1],
                 text=tsne_projection3.index,
                 color=cluster_labels)
fig.update_traces(textposition='top center')
fig.update_layout(height=800, width=800, title_text='Cluster')
fig.show()
Also here the data is not well clustered
Log Transformation + Standard Scaler + PCA
# Fifth pipeline: log1p transform, then standard scaling.
df3 = df.copy()
df3.dropna(inplace=True)
df3.drop('CUST_ID', axis=1, inplace=True)
for col in df3.columns:
df3[col] = np.log1p(df3[col])
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
scaled_data = scaler.fit_transform(df3)
df_scaled = pd.DataFrame(scaled_data, columns = df3.columns)
from sklearn.decomposition import PCA
# PCA (95% variance) on the log-transformed, standard-scaled features.
pca3 = PCA(n_components=0.95)
# BUG FIX: the original reused `pca2` (from the Robust section) instead
# of the freshly created `pca3`.
Res_PCA3 = pca3.fit_transform(df_scaled)
df3_PCA = pd.DataFrame(Res_PCA3)
df3_PCA.head(10)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.230727 | 2.937858 | -0.424505 | -1.127657 | 0.102236 | 0.566265 | -0.450799 | -0.026388 | -0.155797 | -0.331000 |
| 1 | -3.519006 | -0.990269 | 0.235588 | 0.446515 | -1.686637 | -1.021051 | 0.251739 | -0.356291 | -0.162363 | 0.375003 |
| 2 | 1.404060 | -1.235079 | 2.384736 | -1.592585 | 0.797374 | -0.410037 | 0.247088 | 0.851161 | 0.316085 | 1.088526 |
| 3 | -1.161154 | 1.477582 | 0.732243 | -1.910159 | 0.133853 | -0.530514 | -0.508603 | -0.526374 | -0.601995 | -0.267992 |
| 4 | 1.125630 | -0.364839 | -1.829226 | -1.729788 | -0.598401 | -0.050756 | 0.518284 | -1.246441 | 0.343714 | -0.171873 |
| 5 | 4.602633 | -1.814458 | 1.553204 | 1.429255 | -0.890360 | -1.650877 | -0.280104 | 0.261162 | -0.097927 | 0.362230 |
| 6 | 1.589775 | 0.138779 | -2.085167 | -1.362226 | -0.343388 | 0.125271 | 0.417648 | -0.198346 | -0.266940 | 0.179893 |
| 7 | 0.980549 | -0.087691 | 0.593448 | -1.252197 | -0.298159 | 0.038363 | 0.878634 | 0.521098 | -0.265262 | -1.150412 |
| 8 | 0.335677 | 1.156842 | 2.831561 | -0.112430 | -0.669042 | 0.897779 | 1.276683 | 0.313038 | -0.345058 | -0.909823 |
| 9 | 1.639603 | -0.210504 | -2.436002 | -1.672329 | -0.282272 | 0.151602 | 0.157040 | -1.241441 | 0.324925 | 0.208676 |
from sklearn.manifold import TSNE
# 2-D embedding of the Log + Standard-scaler + PCA features.
tsne_projection3 = TSNE(n_components=2,
perplexity=50,
n_iter=10**4,
early_exaggeration = 12,
init='random',
random_state=42).fit_transform(df3_PCA)
tsne_projection3 = pd.DataFrame(tsne_projection3)
tsne_projection3
| 0 | 1 | |
|---|---|---|
| 0 | 44.148529 | 39.466328 |
| 1 | -3.533186 | -91.208305 |
| 2 | -20.178785 | 14.214781 |
| 3 | -4.691086 | -7.307813 |
| 4 | 40.936478 | 62.643265 |
| ... | ... | ... |
| 8631 | 2.741818 | -29.601465 |
| 8632 | 97.680107 | 29.017950 |
| 8633 | 96.088524 | 26.232590 |
| 8634 | 6.259691 | -56.035103 |
| 8635 | -12.270628 | -33.394211 |
8636 rows × 2 columns
from sklearn.metrics import silhouette_score
import numpy as np
# KMeans (k = 2..9) on the Log + Standard-scaler + PCA features.
range_n_cluster = list(range(2,10))
silhoutte_score = []
best_cluster_model = None
for n_clusters in range_n_cluster:
cluster_model = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = cluster_model.fit(df3_PCA).labels_
silhouette_avg = silhouette_score(df3_PCA, cluster_labels)
silhoutte_score += [silhouette_avg]
if silhouette_avg >= np.max(silhoutte_score):
best_cluster_model = cluster_model
plt.plot(range_n_cluster, silhoutte_score)
plt.axvline(best_cluster_model.n_clusters, color='black')
<matplotlib.lines.Line2D at 0x19ea3f4a910>
# Visualise the best KMeans labels on the Log + Standard + PCA embedding.
cluster_labels = ["cluster" + str(label)
for label in best_cluster_model.labels_]
import plotly.express as px
fig = px.scatter(x=tsne_projection3[0],
y=tsne_projection3[1],
text=tsne_projection3.index,
color=cluster_labels)
fig.update_traces(textposition='top center')
fig.update_layout(height=800, width=800, title_text='Cluster')
fig.show()
Bad performance also with this mixture of preprocessing + KMeans
I find that the best choice in this notebook is for Log Transformation after removing rows with null + PCA